In [38]:
# GENDER AGENDAS MAPPER
# V3 - July 2025
# Developed by the Gender Justice Data Hub
# CC BY-NC-SA 4.0, Global Fund for Women
In [3]:
# Uninstall everything related
!pip uninstall -y torch torchvision torchaudio transformers sentence-transformers bertopic umap-learn hdbscan accelerate bitsandbytes xformers

# Clear pip cache
!pip cache purge

# First install PyTorch with CUDA
!pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124

# Then the base dependencies
!pip install transformers==4.35.2 accelerate bitsandbytes

# Install sentence-transformers before BERTopic
!pip install sentence-transformers

# Finally install BERTopic and its dependencies
!pip install bertopic umap-learn hdbscan adjustText
Found existing installation: torch 2.2.2
Uninstalling torch-2.2.2:
  Successfully uninstalled torch-2.2.2
WARNING: Skipping torchvision as it is not installed.
WARNING: Skipping torchaudio as it is not installed.
Found existing installation: transformers 4.53.2
Uninstalling transformers-4.53.2:
  Successfully uninstalled transformers-4.53.2
Found existing installation: sentence-transformers 5.0.0
Uninstalling sentence-transformers-5.0.0:
  Successfully uninstalled sentence-transformers-5.0.0
Found existing installation: bertopic 0.17.3
Uninstalling bertopic-0.17.3:
  Successfully uninstalled bertopic-0.17.3
Found existing installation: umap-learn 0.5.9.post2
Uninstalling umap-learn-0.5.9.post2:
  Successfully uninstalled umap-learn-0.5.9.post2
Found existing installation: hdbscan 0.8.40
Uninstalling hdbscan-0.8.40:
  Successfully uninstalled hdbscan-0.8.40
Found existing installation: accelerate 1.8.1
Uninstalling accelerate-1.8.1:
  Successfully uninstalled accelerate-1.8.1
Found existing installation: bitsandbytes 0.42.0
Uninstalling bitsandbytes-0.42.0:
  Successfully uninstalled bitsandbytes-0.42.0
WARNING: Skipping xformers as it is not installed.
Files removed: 104
Looking in indexes: https://download.pytorch.org/whl/cu124
ERROR: Could not find a version that satisfies the requirement torch==2.6.0 (from versions: none)
ERROR: No matching distribution found for torch==2.6.0
Collecting transformers==4.35.2
  Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.5/123.5 kB 3.1 MB/s eta 0:00:00
Collecting accelerate
  Downloading accelerate-1.8.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Requirement already satisfied: filelock in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (3.13.1)
Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (0.33.4)
Requirement already satisfied: numpy>=1.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (1.26.4)
Requirement already satisfied: packaging>=20.0 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (23.1)
Requirement already satisfied: pyyaml>=5.1 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (6.0.1)
Requirement already satisfied: regex!=2019.12.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (2023.10.3)
Requirement already satisfied: requests in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (2.32.3)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.35.2)
  Downloading tokenizers-0.15.2-cp311-cp311-macosx_10_12_x86_64.whl.metadata (6.7 kB)
Requirement already satisfied: safetensors>=0.3.1 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (0.5.2)
Requirement already satisfied: tqdm>=4.27 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (4.65.0)
Requirement already satisfied: psutil in /opt/anaconda3/lib/python3.11/site-packages (from accelerate) (5.9.0)
Collecting torch>=2.0.0 (from accelerate)
  Downloading torch-2.2.2-cp311-none-macosx_10_9_x86_64.whl.metadata (25 kB)
Requirement already satisfied: scipy in /opt/anaconda3/lib/python3.11/site-packages (from bitsandbytes) (1.11.4)
Requirement already satisfied: fsspec>=2023.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->transformers==4.35.2) (2023.6.0)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->transformers==4.35.2) (4.14.0)
Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->transformers==4.35.2) (1.1.5)
Requirement already satisfied: sympy in /opt/anaconda3/lib/python3.11/site-packages (from torch>=2.0.0->accelerate) (1.12)
Requirement already satisfied: networkx in /opt/anaconda3/lib/python3.11/site-packages (from torch>=2.0.0->accelerate) (3.1)
Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.11/site-packages (from torch>=2.0.0->accelerate) (3.1.3)
Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.11/site-packages (from requests->transformers==4.35.2) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.11/site-packages (from requests->transformers==4.35.2) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.11/site-packages (from requests->transformers==4.35.2) (2.0.7)
Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.11/site-packages (from requests->transformers==4.35.2) (2025.4.26)
Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.11/site-packages (from jinja2->torch>=2.0.0->accelerate) (2.1.3)
Requirement already satisfied: mpmath>=0.19 in /opt/anaconda3/lib/python3.11/site-packages (from sympy->torch>=2.0.0->accelerate) (1.3.0)
Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.9/7.9 MB 7.8 MB/s eta 0:00:0000:0100:01m
Downloading accelerate-1.8.1-py3-none-any.whl (365 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 365.3/365.3 kB 10.6 MB/s eta 0:00:00
Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 105.0/105.0 MB 9.0 MB/s eta 0:00:0000:01m00:01
Downloading tokenizers-0.15.2-cp311-cp311-macosx_10_12_x86_64.whl (2.6 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.6/2.6 MB 9.0 MB/s eta 0:00:0000:0100:01m
Downloading torch-2.2.2-cp311-none-macosx_10_9_x86_64.whl (150.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 150.8/150.8 MB 10.1 MB/s eta 0:00:0000:0100:01
Installing collected packages: torch, bitsandbytes, tokenizers, accelerate, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.2
    Uninstalling tokenizers-0.21.2:
      Successfully uninstalled tokenizers-0.21.2
Successfully installed accelerate-1.8.1 bitsandbytes-0.42.0 tokenizers-0.15.2 torch-2.2.2 transformers-4.35.2
Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.9/40.9 kB 2.2 MB/s eta 0:00:00
Requirement already satisfied: tqdm in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.65.0)
Requirement already satisfied: torch>=1.11.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (2.2.2)
Requirement already satisfied: scikit-learn in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (1.6.1)
Requirement already satisfied: scipy in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (1.11.4)
Requirement already satisfied: huggingface-hub>=0.20.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (0.33.4)
Requirement already satisfied: Pillow in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (10.2.0)
Requirement already satisfied: typing_extensions>=4.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.14.0)
Requirement already satisfied: filelock in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (3.13.1)
Requirement already satisfied: fsspec>=2023.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2023.6.0)
Requirement already satisfied: packaging>=20.9 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (23.1)
Requirement already satisfied: pyyaml>=5.1 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (6.0.1)
Requirement already satisfied: requests in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2.32.3)
Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (1.1.5)
Requirement already satisfied: sympy in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (1.12)
Requirement already satisfied: networkx in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (3.1)
Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (3.1.3)
Requirement already satisfied: numpy>=1.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (1.26.4)
Requirement already satisfied: regex!=2019.12.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (2023.10.3)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.21.2-cp39-abi3-macosx_10_12_x86_64.whl.metadata (6.8 kB)
Requirement already satisfied: safetensors>=0.4.3 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.5.2)
Requirement already satisfied: joblib>=1.2.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn->sentence-transformers) (1.2.0)
Requirement already satisfied: threadpoolctl>=3.1.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn->sentence-transformers) (3.5.0)
Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.11/site-packages (from jinja2->torch>=1.11.0->sentence-transformers) (2.1.3)
Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.0.7)
Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2025.4.26)
Requirement already satisfied: mpmath>=0.19 in /opt/anaconda3/lib/python3.11/site-packages (from sympy->torch>=1.11.0->sentence-transformers) (1.3.0)
Downloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 470.2/470.2 kB 9.6 MB/s eta 0:00:00ta 0:00:01
Downloading transformers-4.53.2-py3-none-any.whl (10.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.8/10.8 MB 12.5 MB/s eta 0:00:0000:0100:01
Downloading tokenizers-0.21.2-cp39-abi3-macosx_10_12_x86_64.whl (2.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.9/2.9 MB 11.2 MB/s eta 0:00:0000:0100:01
Installing collected packages: tokenizers, transformers, sentence-transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.2
    Uninstalling tokenizers-0.15.2:
      Successfully uninstalled tokenizers-0.15.2
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed sentence-transformers-5.0.0 tokenizers-0.21.2 transformers-4.53.2
Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting umap-learn
  Downloading umap_learn-0.5.9.post2-py3-none-any.whl.metadata (25 kB)
Collecting hdbscan
  Downloading hdbscan-0.8.40-cp311-cp311-macosx_10_9_universal2.whl.metadata (15 kB)
Requirement already satisfied: adjustText in /opt/anaconda3/lib/python3.11/site-packages (1.3.0)
Requirement already satisfied: numpy>=1.20.0 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (1.26.4)
Requirement already satisfied: pandas>=1.1.5 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (2.1.4)
Requirement already satisfied: plotly>=4.7.0 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (5.9.0)
Requirement already satisfied: scikit-learn>=1.0 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (1.6.1)
Requirement already satisfied: sentence-transformers>=0.4.1 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (5.0.0)
Requirement already satisfied: tqdm>=4.41.1 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (4.65.0)
Requirement already satisfied: llvmlite>0.36.0 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (0.42.0)
Requirement already satisfied: scipy>=1.3.1 in /opt/anaconda3/lib/python3.11/site-packages (from umap-learn) (1.11.4)
Requirement already satisfied: numba>=0.51.2 in /opt/anaconda3/lib/python3.11/site-packages (from umap-learn) (0.59.0)
Requirement already satisfied: pynndescent>=0.5 in /opt/anaconda3/lib/python3.11/site-packages (from umap-learn) (0.5.13)
Requirement already satisfied: joblib>=1.0 in /opt/anaconda3/lib/python3.11/site-packages (from hdbscan) (1.2.0)
Requirement already satisfied: matplotlib in /opt/anaconda3/lib/python3.11/site-packages (from adjustText) (3.8.0)
Requirement already satisfied: python-dateutil>=2.8.2 in /opt/anaconda3/lib/python3.11/site-packages (from pandas>=1.1.5->bertopic) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /opt/anaconda3/lib/python3.11/site-packages (from pandas>=1.1.5->bertopic) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in /opt/anaconda3/lib/python3.11/site-packages (from pandas>=1.1.5->bertopic) (2023.3)
Requirement already satisfied: tenacity>=6.2.0 in /opt/anaconda3/lib/python3.11/site-packages (from plotly>=4.7.0->bertopic) (8.2.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn>=1.0->bertopic) (3.5.0)
Requirement already satisfied: transformers<5.0.0,>=4.41.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (4.53.2)
Requirement already satisfied: torch>=1.11.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (2.2.2)
Requirement already satisfied: huggingface-hub>=0.20.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (0.33.4)
Requirement already satisfied: Pillow in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (10.2.0)
Requirement already satisfied: typing_extensions>=4.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (4.14.0)
Requirement already satisfied: contourpy>=1.0.1 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (1.2.0)
Requirement already satisfied: cycler>=0.10 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (1.4.4)
Requirement already satisfied: packaging>=20.0 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (23.1)
Requirement already satisfied: pyparsing>=2.3.1 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (3.0.9)
Requirement already satisfied: filelock in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (3.13.1)
Requirement already satisfied: fsspec>=2023.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2023.6.0)
Requirement already satisfied: pyyaml>=5.1 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (6.0.1)
Requirement already satisfied: requests in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2.32.3)
Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (1.1.5)
Requirement already satisfied: six>=1.5 in /opt/anaconda3/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas>=1.1.5->bertopic) (1.16.0)
Requirement already satisfied: sympy in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (1.12)
Requirement already satisfied: networkx in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (3.1)
Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (3.1.3)
Requirement already satisfied: regex!=2019.12.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers>=0.4.1->bertopic) (2023.10.3)
Requirement already satisfied: tokenizers<0.22,>=0.21 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers>=0.4.1->bertopic) (0.21.2)
Requirement already satisfied: safetensors>=0.4.3 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers>=0.4.1->bertopic) (0.5.2)
Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.11/site-packages (from jinja2->torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (2.1.3)
Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2.0.7)
Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2025.4.26)
Requirement already satisfied: mpmath>=0.19 in /opt/anaconda3/lib/python3.11/site-packages (from sympy->torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (1.3.0)
Downloading bertopic-0.17.3-py3-none-any.whl (153 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 153.0/153.0 kB 5.7 MB/s eta 0:00:00
Downloading umap_learn-0.5.9.post2-py3-none-any.whl (90 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 90.1/90.1 kB 7.1 MB/s eta 0:00:00
Downloading hdbscan-0.8.40-cp311-cp311-macosx_10_9_universal2.whl (1.5 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.5/1.5 MB 11.8 MB/s eta 0:00:0000:0100:01
Installing collected packages: hdbscan, umap-learn, bertopic
Successfully installed bertopic-0.17.3 hdbscan-0.8.40 umap-learn-0.5.9.post2
In [4]:
!pip install openai --upgrade
Requirement already satisfied: openai in /opt/anaconda3/lib/python3.11/site-packages (1.88.0)
Collecting openai
  Downloading openai-1.95.1-py3-none-any.whl.metadata (29 kB)
Requirement already satisfied: anyio<5,>=3.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (4.2.0)
Requirement already satisfied: distro<2,>=1.7.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (1.8.0)
Requirement already satisfied: httpx<1,>=0.23.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (0.28.1)
Requirement already satisfied: jiter<1,>=0.4.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (0.10.0)
Requirement already satisfied: pydantic<3,>=1.9.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (2.8.2)
Requirement already satisfied: sniffio in /opt/anaconda3/lib/python3.11/site-packages (from openai) (1.3.0)
Requirement already satisfied: tqdm>4 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (4.65.0)
Requirement already satisfied: typing-extensions<5,>=4.11 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (4.14.0)
Requirement already satisfied: idna>=2.8 in /opt/anaconda3/lib/python3.11/site-packages (from anyio<5,>=3.5.0->openai) (3.10)
Requirement already satisfied: certifi in /opt/anaconda3/lib/python3.11/site-packages (from httpx<1,>=0.23.0->openai) (2025.4.26)
Requirement already satisfied: httpcore==1.* in /opt/anaconda3/lib/python3.11/site-packages (from httpx<1,>=0.23.0->openai) (1.0.7)
Requirement already satisfied: h11<0.15,>=0.13 in /opt/anaconda3/lib/python3.11/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai) (0.14.0)
Requirement already satisfied: annotated-types>=0.4.0 in /opt/anaconda3/lib/python3.11/site-packages (from pydantic<3,>=1.9.0->openai) (0.6.0)
Requirement already satisfied: pydantic-core==2.20.1 in /opt/anaconda3/lib/python3.11/site-packages (from pydantic<3,>=1.9.0->openai) (2.20.1)
Downloading openai-1.95.1-py3-none-any.whl (755 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 755.6/755.6 kB 11.2 MB/s eta 0:00:00a 0:00:01
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.88.0
    Uninstalling openai-1.88.0:
      Successfully uninstalled openai-1.88.0
Successfully installed openai-1.95.1
In [5]:
!pip install polars-lts-cpu
Requirement already satisfied: polars-lts-cpu in /opt/anaconda3/lib/python3.11/site-packages (1.31.0)
In [13]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer

import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

from bertopic import BERTopic
print("BERTopic imported successfully")

from sentence_transformers import SentenceTransformer
print("SentenceTransformers is working")

from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

import openai
PyTorch version: 2.2.2
CUDA available: False
BERTopic imported successfully
SentenceTransformers is working
In [17]:
import pandas as pd
import re
import torch
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

from bertopic import BERTopic
from bertopic.representation import OpenAI, KeyBERTInspired, MaximalMarginalRelevance

# Load API key from .env
from dotenv import load_dotenv
import os
import openai

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

print("Packages loaded successfully.")
Packages loaded successfully.
[nltk_data] Downloading package stopwords to /Users/Condi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [19]:
import openai
from dotenv import load_dotenv
import os

# Load API key from .env file
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

# Create OpenAI client for v1.x
client = openai.OpenAI(api_key=api_key)

# Load topic representation
from bertopic.representation import OpenAI as OpenAI_Representation

representation_model = OpenAI_Representation(
    client=client,
    model="gpt-4o",
    delay_in_seconds=10
)

# Prompt
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short topic label in the following format:
topic: <topic label>
"""
In [21]:
import pandas as pd

file_list = [
    "Input/gender_Apr25-1.csv"
]

# Read, convert date, and filter in a single line per file
dfs = []
for f in file_list:
    df_temp = pd.read_csv(f)
    df_temp["event_date"] = pd.to_datetime(df_temp["event_date"])
    df_temp = df_temp[df_temp["event_date"] >= "2018-01-01"]
    dfs.append(df_temp)

# Concatenate all filtered DataFrames
df = pd.concat(dfs, ignore_index=True)
In [23]:
print("Number of rows after filtering:", len(df))
print("Dates:", df["event_date"].min(), "→", df["event_date"].max())
Number of rows after filtering: 78145
Dates: 2018-01-01 00:00:00 → 2025-04-25 00:00:00
In [25]:
import polars as pl

pl_df = pl.from_pandas(df)
pl_df
Out[25]:
shape: (78_145, 31)
event_id_cntyevent_dateyeartime_precisiondisorder_typeevent_typesub_event_typeactor1assoc_actor_1inter1actor2assoc_actor_2inter2interactioncivilian_targetingisoregioncountryadmin1admin2admin3locationlatitudelongitudegeo_precisionsourcesource_scalenotesfatalitiestagstimestamp
strdatetime[ns]i64i64strstrstrstrstrstrstrstrstrstrstri64strstrstrstrstrstrf64f64i64strstrstri64stri64
"ARG16601"2025-04-25 00:00:0020251"Demonstrations""Protests""Peaceful protest""Protesters (Argentina)""Women (Argentina)""Protesters"nullnullnull"Protesters only"null32"South America""Argentina""Cordoba""Punilla"null"Capilla del Monte"-30.8568-64.52581"El Diario de Carlos Paz""Subnational""On 25 April 2025, in Capilla d…0"crowd size=large"1745881584
"BRA96908"2025-04-25 00:00:0020252"Political violence""Violence against civilians""Attack""CV: Red Command"null"Political militia""Civilians (Brazil)""Women (Brazil)""Civilians""Political militia-Civilians""Civilian targeting"76"South America""Brazil""Bahia""Salvador"null"Salvador"-12.9711-38.51081"Alo Juca; Bnews (Brazil)""Subnational-National""Around 25 April 2025 (as repor…1"women targeted: girls"1745881585
"ISR45719"2025-04-25 00:00:0020251"Demonstrations""Protests""Peaceful protest""Protesters (Israel)""Shift 101; Women (Israel)""Protesters"nullnullnull"Protesters only"null376"Middle East""Israel""Jerusalem""Jerusalem""Judean Mountains""Jerusalem"31.76935.21631"Haaretz""National""On 25 April 2025, about 200 Is…0"crowd size=about 200"1745881590
"MEX103000"2025-04-25 00:00:0020251"Political violence""Violence against civilians""Attack""Unidentified Armed Group (Mexi…null"Political militia""Civilians (Mexico)""Labor Group (Mexico); Women (M…"Civilians""Political militia-Civilians""Civilian targeting"484"North America""Mexico""Guanajuato""Leon"null"Leon de los Aldama"21.122-101.68321"Zona Franca""Subnational""On 25 April 2025, in Leon de l…1null1745881592
"MEX103223"2025-04-25 00:00:0020251"Political violence""Violence against civilians""Attack""Unidentified Gang (Mexico)"null"Political militia""Civilians (Mexico)""Women (Mexico)""Civilians""Political militia-Civilians""Civilian targeting"484"North America""Mexico""Veracruz de Ignacio de la Llav…"Coxquihui"null"Sabanas de Xalostoc"20.2216-97.53491"Imagen del Golfo""Subnational""On 25 April 2025, in Sabanas d…2"women targeted: relatives of t…1745881593
…………………………………………………………………………………
"VEN8468"2018-01-01 00:00:0020182"Political violence; Demonstrat…"Protests""Excessive force against protes…"Protesters (Venezuela)""Women (Venezuela)""Protesters""Military Forces of Venezuela (…null"State forces""State forces-Protesters""Civilian targeting"862"South America""Venezuela""Distrito Capital""Libertador"null"Caracas - Libertador"10.5127-66.91291"Venezuelanalysis.com""International""Around 1 January 2018 (as repo…1"crowd size=no report; women ta…1675724415
"DRC11890"2018-01-01 00:00:0020181"Political violence""Violence against civilians""Attack""Nyatura Militia (Kasongo)"null"Identity militia""Civilians (Democratic Republic…"Women (Democratic Republic of …"Civilians""Identity militia-Civilians""Civilian targeting"180"Middle Africa""Democratic Republic of Congo""Nord-Kivu""Walikale""Walikale""Walikale"-1.42828.0731"Kivu Security Tracker""Other""On 1 January 2018, an 18 year …1null1694477354
"MEX26708"2018-01-01 00:00:0020183"Political violence""Violence against civilians""Sexual violence""Police Forces of Mexico (2012-…null"State forces""Civilians (Mexico)""Women (Mexico)""Civilians""State forces-Civilians""Civilian targeting"484"North America""Mexico""Nuevo Leon""San Pedro Garza Garcia"null"San Pedro Garza Garcia"25.6652-100.40251"El Norte""Subnational""On January 2018 in San Pedro G…0null1702344316
"DRC11889"2018-01-01 00:00:0020181"Political violence""Violence against civilians""Sexual violence""Unidentified Armed Group (Demo…null"Political militia""Civilians (Democratic Republic…"Women (Democratic Republic of …"Civilians""Political militia-Civilians""Civilian targeting"180"Middle Africa""Democratic Republic of Congo""Sud-Kivu""Uvira""Uvira""Mulima"-3.381929.13951"Kivu Security Tracker""Other""On 1 January 2018, unidentifie…0"women targeted: girls"1705978242
"SYR18080"2018-01-01 00:00:0020181"Political violence""Violence against civilians""Attack""Islamic State in Iraq and the …null"Rebel group""Civilians (Syria)""Prisoners (Syria); Women (Syri…"Civilians""Rebel group-Civilians""Civilian targeting"760"Middle East""Syria""Deir ez Zor""Deir ez Zor""Deir ez Zor""Deir-ez-Zor"35.331940.14613"SNHR; SOHR""Local partner-Other""A woman was executed by Islami…1null1730157026
In [27]:
import polars as pl
import pandas as pd
import re

# Get unique list of places and create the pattern
place_cols = ["country", "location", "admin1", "admin2", "admin3"]
place_series = [pl_df[col].drop_nulls().unique() for col in place_cols]
places = set()
for s in place_series:
    places.update(s.to_list())

places = [l.strip() for l in places if isinstance(l, str)]
pattern = r'\b(' + '|'.join(map(re.escape, places)) + r')\b'

# Apply cleaning directly in Polars
pl_df_clean = pl_df.with_columns(
    pl.col("notes")
      .cast(pl.String)
      .str.replace_all(pattern, "")
      .str.replace_all(r"\b(19|20)\d{2}\b", "")
      .alias("notes_clean")
)

# Extract the final list
documents = pl_df_clean["notes_clean"].drop_nulls().to_list()
In [29]:
titles = df["event_id_cnty"].dropna().tolist()
In [31]:
import pandas as pd
import polars as pl

def glimpse_polars(df: pl.DataFrame, max_cols=100, max_rows=5):
    
    print(f"Observations: {df.height:,}")
    print(f"Variables: {df.width:,}")
    print("-" * 100)

    col_info = []
    for i, col_name in enumerate(df.columns):
        if i < max_cols:
            col_series = df.get_column(col_name)
            dtype = col_series.dtype
            non_null_count = col_series.len() - col_series.null_count()
            unique_count = col_series.n_unique()
            
            sample_values = col_series.drop_nulls().unique().slice(0, max_rows).to_list()

            col_info.append({
                "Variable": col_name,
                "Type": dtype,
                "Non-Null": f"{non_null_count:,}",
                "Unique": f"{unique_count:,}",
                "Sample Values": sample_values
            })

    col_info_df = pd.DataFrame(col_info)
    print(col_info_df.to_string(index=False, max_colwidth=100))

    if df.width > max_cols:
        print(f"\n... and {df.width - max_cols} more variables")
    print("-" * 100)

glimpse_polars(pl_df_clean)
Observations: 78,145
Variables: 32
----------------------------------------------------------------------------------------------------
          Variable                                     Type Non-Null Unique                                                                                        Sample Values
     event_id_cnty                                   String   78,145 78,145                                                    [YEM96399, IND52439, TUR9336, IND132953, PER1864]
        event_date Datetime(time_unit='ns', time_zone=None)   78,145  2,672 [2018-01-01 00:00:00, 2018-01-02 00:00:00, 2018-01-03 00:00:00, 2018-01-04 00:00:00, 2018-01-05 0...
              year                                    Int64   78,145      8                                                                       [2018, 2019, 2020, 2021, 2022]
    time_precision                                    Int64   78,145      3                                                                                            [1, 2, 3]
     disorder_type                                   String   78,145      4     [Strategic developments, Political violence, Political violence; Demonstrations, Demonstrations]
        event_type                                   String   78,145      5    [Explosions/Remote violence, Strategic developments, Violence against civilians, Riots, Protests]
    sub_event_type                                   String   78,145     16                       [Attack, Mob violence, Other, Peaceful protest, Remote explosive/landmine/IED]
            actor1                                   String   78,145  1,760 [Police Forces of Peru (2020-2020), Police Forces of Burundi (2005-), Police Forces of Indonesia ...
     assoc_actor_1                                   String   58,855 12,468 [Chippewa Tribal Group (United States); DEM: Democratic Party; Government of the United States (2...
            inter1                                   String   78,145      8                            [Political militia, Identity militia, Protesters, Rebel group, Civilians]
            actor2                                   String   26,328    560 [Civilians (Haiti), Private Security Forces (Moldova), Civilians (Guinea-Bissau), Civilians (Taiw...
     assoc_actor_2                                   String   23,284  3,483 [Farmers (Colombia); Women (Colombia), Women (East Timor), Journalists (Turkmenistan); Women (Tur...
            inter2                                   String   26,328      8                        [External/Other forces, State forces, Protesters, Rioters, Political militia]
       interaction                                   String   78,145     22 [Rebel group-Civilians, Rebel group-Rioters, Protesters only, Rioters-Civilians, Political militi...
civilian_targeting                                   String   20,541      2                                                                                 [Civilian targeting]
               iso                                    Int64   78,145    195                                                                                    [0, 4, 8, 12, 20]
            region                                   String   78,145     16                                          [Oceania, Caribbean, Europe, Western Africa, South America]
           country                                   String   78,145    195                       [Spain, Bosnia and Herzegovina, Honduras, Malta, Democratic Republic of Congo]
            admin1                                   String   78,145  2,118                                                           [Baringo, Loreto, Grodno, Mayabeque, Loja]
            admin2                                   String   76,250  9,590                                         [Edirne, Bushehr, Sao Jose dos Quatro Marcos, Atrato, Bangi]
            admin3                                   String   29,184  5,927                                                          [Midebdo, Kopera, Tirioko, Jamalpur, Dimow]
          location                                   String   78,145 19,306                                            [Tlacuilotepec, Ciales, Hunucma, Ah Htet Ngar Nan, Sanok]
          latitude                                  Float64   78,145 19,693                                                   [-54.8062, -54.5119, -53.7865, -53.1548, -51.7308]
         longitude                                  Float64   78,145 19,856                                              [-171.7553, -161.7558, -159.7804, -159.3721, -158.4575]
     geo_precision                                    Int64   78,145      3                                                                                            [1, 2, 3]
            source                                   String   78,145 15,326 [Aydinlik; Bianet; Evrensel; Sendika.org, N12, Daily Independent (Nigeria); Daily Trust (Nigeria)...
      source_scale                                   String   78,145     26             [New media-International, Other-New media, Regional, New media-Regional, Other-National]
             notes                                   String   78,145 77,911 [On 16 August 2019, teachers and their families including women staged a protest in Agartala city...
        fatalities                                    Int64   78,145     47                                                                                      [0, 1, 2, 3, 4]
              tags                                   String   62,222  2,692 [crowd size=more than 530, crowd size=10,000 - 20,000, crowd size=approximately 35, counter-demon...
         timestamp                                    Int64   78,145 14,517                                         [1559160369, 1559160524, 1559160525, 1559160527, 1559160529]
       notes_clean                                   String   78,145 72,931 [On 13 October , women (likely ) staged a protest in  district (coded to  town, ) and blocked a c...
----------------------------------------------------------------------------------------------------
In [33]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI

# Embeddings
embedding_model = SentenceTransformer("all-mpnet-base-v2")
embeddings = embedding_model.encode(documents, show_progress_bar=True)

# UMAP for dimensionality reduction (5D for clustering)
umap_model = UMAP(
    n_neighbors=40,
    n_components=5,
    min_dist=0.4,
    metric='cosine',
    random_state=42
)

# HDBSCAN for clustering
hdbscan_model = HDBSCAN(
    min_cluster_size=40,
    min_samples=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True
)

# Dimensionality reduction for visualization (2D)
reduced_embeddings = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42
).fit_transform(embeddings)

# Stopwords and vectorization
stopwords_total = set(stopwords.words("spanish") + stopwords.words("english"))
vectorizer_model = CountVectorizer(stop_words=list(stopwords_total))

# Semantic representations
representation_model = {
    "KeyBERT": KeyBERTInspired(),
    "MMR": MaximalMarginalRelevance(diversity=0.3),
    "OpenAI": OpenAI(
        client=client,
        model="gpt-4o",
        prompt=prompt
    )
}
Batches:   0%|          | 0/2443 [00:00<?, ?it/s]
In [36]:
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    top_n_words=10,
    verbose=True
)

# Fit the model and transform the documents
topics, probs = topic_model.fit_transform(documents, embeddings)
2025-07-14 18:09:35,917 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-14 18:11:11,761 - BERTopic - Dimensionality - Completed ✓
2025-07-14 18:11:11,763 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2025-07-14 18:11:15,792 - BERTopic - Cluster - Completed ✓
2025-07-14 18:11:15,801 - BERTopic - Representation - Fine-tuning topics using representation models.
100%|██████████| 255/255 [06:10<00:00,  1.45s/it]
2025-07-14 18:17:46,001 - BERTopic - Representation - Completed ✓
In [37]:
# Basic topic information
print(topic_model.get_topic_info())

# Keywords of a specific topic
print(topic_model.get_topic(0))

# Visualization
topic_model.visualize_topics()
     Topic  Count                                           Name  \
0       -1  35963                   -1_women_group_march_protest   
1        0   1942                  0_found_body_fatality_wrapped   
2        1   1410              1_conference_press_picketed_urged   
3        2   1310              2_meitei_meira_tribal_communities   
4        3    971            3_femicide_justice_demand_femicides   
..     ...    ...                                            ...   
250    249     41        249_hijab_religious_sociopolitical_veil   
251    250     41  250_coordinated_restrictions_abortions_mostly   
252    251     40          251_morality_infringing_iranian_hijab   
253    252     40              252_akoko_fulani_lga_pastoralists   
254    253     40                  253_femicide_case_town_recent   

                                        Representation  \
0    [women, group, march, protest, international, ...   
1    [found, body, fatality, wrapped, tied, reporte...   
2    [conference, press, picketed, urged, seoul, ko...   
3    [meitei, meira, tribal, communities, amidst, p...   
4    [femicide, justice, demand, femicides, case, f...   
..                                                 ...   
250  [hijab, religious, sociopolitical, veil, guida...   
251  [coordinated, restrictions, abortions, mostly,...   
252  [morality, infringing, iranian, hijab, died, c...   
253  [akoko, fulani, lga, pastoralists, ruler, comm...   
254  [femicide, case, town, recent, occurred, targe...   

                                               KeyBERT  \
0    [demonstrators, protesters, protested, protest...   
1    [strangled, corpse, decapitated, fatalities, d...   
2    [picketed, pickets, solidarity, kctu, committe...   
3    [protest, activists, militants, meitei, arrest...   
4    [victims, protested, protesters, protest, acti...   
..                                                 ...   
250  [hijabs, hijab, sharia, islamic, police, arres...   
251  [activists, protesting, abortions, rally, wome...   
252  [protesters, protestors, protests, hijab, iran...   
253  [protested, grievance, ekoko, akoko, attacks, ...   
254  [protest, protesting, activists, women, group,...   

                                                   MMR  \
0    [group, march, protest, day, woman, police, pr...   
1    [found, fatality, wrapped, colonia, torture, w...   
2    [conference, picketed, seoul, banners, harassm...   
3    [meitei, tribal, paibi, torch, apunba, state, ...   
4    [femicide, justice, cases, victims, feminist, ...   
..                                                 ...   
250  [hijab, sociopolitical, veil, forbidding, isla...   
251  [coordinated, restrictions, abortions, protest...   
252  [morality, iranian, hijab, rules, iranians, ve...   
253  [akoko, fulani, pastoralists, protested, axis,...   
254  [femicide, town, cases, protest, pozcu, incide...   

                                                OpenAI  \
0                [Protests and Violence Against Women]   
1    [Women's Homicides with Bodies Found Wrapped i...   
2    [Protests Against Sexual Harassment and Advoca...   
3    [Meira Paibi Protests Amidst Meitei-Tribal Con...   
4             [Protests for Justice in Femicide Cases]   
..                                                 ...   
250  [Enforcement of Hijab by Iranian Morality Poli...   
251    [Women's Protest Against Abortion Restrictions]   
252  [Protests Against Hijab Rules and Mahsa Amini'...   
253  [Protests Against Insecurity and Fulani Pastor...   
254      [Protests Against Femicide by Women's Groups]   

                                   Representative_Docs  
0    [On 8 March , in , , around 50 women from vari...  
1    [Around 14 February  (as reported), in , , a w...  
2    [On 16 November , members from the National  o...  
3    [On 11 August , local Meira Paibi (likely Meit...  
4    [On 16 January , in  (), 100 people, including...  
..                                                 ...  
250  [Other: On 18 June , Iranian Guidance Patrol p...  
251  [On 30 October , activists, mostly women, gath...  
252  [On 22 October , protestors, including Iranian...  
253  [On 15 January , hundreds of women from the Ak...  
254  [On 14 January , a women group staged a protes...  

[255 rows x 8 columns]
[('found', 0.066610601124112), ('body', 0.06484559077992424), ('fatality', 0.03838972406268369), ('wrapped', 0.03519847420814442), ('tied', 0.033365548692720605), ('reported', 0.03319917444221136), ('colonia', 0.031008692351572328), ('plastic', 0.026284319891301675), ('signs', 0.025845700064423587), ('killed', 0.024522933445648536)]
In [42]:
topic_model.get_topic_info()
Out[42]:
Topic Count Name Representation KeyBERT MMR OpenAI Representative_Docs
0 -1 35963 -1_women_group_march_protest [women, group, march, protest, international, ... [demonstrators, protesters, protested, protest... [group, march, protest, day, woman, police, pr... [Protests and Violence Against Women] [On 8 March , in , , around 50 women from vari...
1 0 1942 0_found_body_fatality_wrapped [found, body, fatality, wrapped, tied, reporte... [strangled, corpse, decapitated, fatalities, d... [found, fatality, wrapped, colonia, torture, w... [Women's Homicides with Bodies Found Wrapped i... [Around 14 February (as reported), in , , a w...
2 1 1410 1_conference_press_picketed_urged [conference, press, picketed, urged, seoul, ko... [picketed, pickets, solidarity, kctu, committe... [conference, picketed, seoul, banners, harassm... [Protests Against Sexual Harassment and Advoca... [On 16 November , members from the National o...
3 2 1310 2_meitei_meira_tribal_communities [meitei, meira, tribal, communities, amidst, p... [protest, activists, militants, meitei, arrest... [meitei, tribal, paibi, torch, apunba, state, ... [Meira Paibi Protests Amidst Meitei-Tribal Con... [On 11 August , local Meira Paibi (likely Meit...
4 3 971 3_femicide_justice_demand_femicides [femicide, justice, demand, femicides, case, f... [victims, protested, protesters, protest, acti... [femicide, justice, cases, victims, feminist, ... [Protests for Justice in Femicide Cases] [On 16 January , in (), 100 people, including...
... ... ... ... ... ... ... ... ...
250 249 41 249_hijab_religious_sociopolitical_veil [hijab, religious, sociopolitical, veil, guida... [hijabs, hijab, sharia, islamic, police, arres... [hijab, sociopolitical, veil, forbidding, isla... [Enforcement of Hijab by Iranian Morality Poli... [Other: On 18 June , Iranian Guidance Patrol p...
251 250 41 250_coordinated_restrictions_abortions_mostly [coordinated, restrictions, abortions, mostly,... [activists, protesting, abortions, rally, wome... [coordinated, restrictions, abortions, protest... [Women's Protest Against Abortion Restrictions] [On 30 October , activists, mostly women, gath...
252 251 40 251_morality_infringing_iranian_hijab [morality, infringing, iranian, hijab, died, c... [protesters, protestors, protests, hijab, iran... [morality, iranian, hijab, rules, iranians, ve... [Protests Against Hijab Rules and Mahsa Amini'... [On 22 October , protestors, including Iranian...
253 252 40 252_akoko_fulani_lga_pastoralists [akoko, fulani, lga, pastoralists, ruler, comm... [protested, grievance, ekoko, akoko, attacks, ... [akoko, fulani, pastoralists, protested, axis,... [Protests Against Insecurity and Fulani Pastor... [On 15 January , hundreds of women from the Ak...
254 253 40 253_femicide_case_town_recent [femicide, case, town, recent, occurred, targe... [protest, protesting, activists, women, group,... [femicide, town, cases, protest, pozcu, incide... [Protests Against Femicide by Women's Groups] [On 14 January , a women group staged a protes...

255 rows × 8 columns

In [44]:
openai_topics = topic_model.get_topics(full=True)["OpenAI"]

for topic_id, label_info in openai_topics.items():
    label = label_info[0][0].split("\n")[0]
    print(f"Topic {topic_id:>2}: {label}")
Topic -1: Protests and Violence Against Women
Topic  0: Women's Homicides with Bodies Found Wrapped in Blankets
Topic  1: Protests Against Sexual Harassment and Advocacy for Gender Equality in South Korea
Topic  2: Meira Paibi Protests Amidst Meitei-Tribal Conflict
Topic  3: Protests for Justice in Femicide Cases
Topic  4: Protests and Unrest Following Mahsa Amini's Death
Topic  5: Abortion Rights Protest against Supreme Court Decision to Overturn Roe v. Wade
Topic  6: Political Protests and Clashes in India Involving BJP, INC, AIMC Over Women's Safety and Governance Issues
Topic  7: Abductions and Ransom Demands in Conflict Zones
Topic  8: Anganwadi Workers' Protest for Regularisation and Salary Increase
Topic  9: Monthly Flower Demonstrations Supporting MeToo Movement Against Sexual Violence
Topic 10: Military Arrests and Detentions in Township Regions
Topic 11: Protests Against Gender-Based Violence on International Day
Topic 12: Abductions and Attacks by Presumed ISWAP or Boko Haram Militants
Topic 13: Comfort Women Protests in Front of Former Japanese Embassy in Seoul
Topic 14: Suppression of Female Petitioners and Human Rights Defenders
Topic 15: Global Protests for Abortion Legalization on September 28th
Topic 16: Gun Violence Awareness and Advocacy
Topic 17: Protests Against the Death of Mahsa Amini by Iranian Communities
Topic 18: Labor Protests and Worker Rights for Domestic and Health Workers
Topic 19: Drive-by Shootings by Armed Motorcyclists Targeting Women
Topic 20: Nationwide Protests Against Trump Administration Policies
Topic 21: Nationwide Protests Against Transphobic Legislation and Media
Topic 22: Women's Rights Activism in Sweden
Topic 23: ASHA Health Workers' Wage Protests in J&K
Topic 24: Protests Against Japanese Cabinet Over Article 9 Amendment
Topic 25: Witchcraft-related Attacks and Mob Violence in Villages
Topic 26: Student Protests for Justice for Mahsa Amini and Anti-Government Sentiments
Topic 27: Violence Against Transgender Women and Hate Crimes
Topic 28: Women's Protests Against Military Coup and Support for Civil Disobedience Movement
Topic 29: Protests Against Far-Right Extremism and AfD
Topic 30: Women Protesting Water Supply Shortages in J&K
Topic 31: Black Lives Matter Movement and Protests against Police Brutality
Topic 32: Taliban's Violations and Abuse Against Women
Topic 33: Ambazonian Separatists' Abductions and Violence Against Civilians
Topic 34: Sexual Violence by Military Personnel in African Regions
Topic 35: Drug Trafficking-Related Homicides of Women
Topic 36: Political Violence in West Bengal Elections
Topic 37: Mothers' Protests for Missing Relatives on Mother's Day
Topic 38: Armed Attacks on Women in Colonia Areas
Topic 39: Afghan Women's Protests Against Taliban Policies
Topic 40: Houthi-Sponsored Protests Against Israeli Actions in Solidarity with Palestinians
Topic 41: Extinction Rebellion and Climate Action Protests
Topic 42: Police Brutality Against Women
Topic 43: Protests Against Isolation and Visitor Ban of PKK Leader Abdullah Ocalan
Topic 44: Protests Against Gender-Based Violence in Universities
Topic 45: Protests for Social Support and Subsidized Housing for Low-Income Families in Kazakhstan
Topic 46: Red Dress Day and MMIW Awareness Events
Topic 47: Women's Rights Demonstrations Against Violence
Topic 48: Pro-Palestinian Protests for Ceasefire amid Israel-Hamas Conflict
Topic 49: Pro-Choice Protests Following Roe v. Wade Overturn
Topic 50: Women's March and Advocacy for Rights
Topic 51: Women's Strike Protests Against Abortion Restrictions
Topic 52: Military Violence and Arson in Villages
Topic 53: Mob Violence Against Women Suspected of Child Lifting
Topic 54: Protests Against Citizenship (Amendment) Act and NRC by Women
Topic 55: Israeli-Palestinian Settler Violence and Injuries
Topic 56: Parent Protests Over School Management and Conditions
Topic 57: Police Misconduct and Sexual Assault in Custody
Topic 58: Mysterious Deaths of Women with Burned Bodies Found
Topic 59: Protests Against Gender-Based Violence (GBV) During 16 Days of Activism
Topic 60: Ukrainian Protests Against Russian Invasion
Topic 61: Motorcycle Drive-by Shootings Targeting Women
Topic 62: Hijab Protests in Educational Institutions
Topic 63: Protests by Female College Students Over Hostel Conditions
Topic 64: Protests Against Sexual Harassment in Wrestling Federation
Topic 65: Sexual Assault and Violence by Imbonerakure Members
Topic 66: Houthi Sniper Attacks on Civilians in Yemen
Topic 67: Monthly Flower Demonstrations Against Sexual Violence Acquittals
Topic 68: Sexual Violence and Killings by Military Forces in Conflict Zones
Topic 69: National Strike and Demonstration Against Austerity and for Economic and Gender Equality
Topic 70: Sexual Violence and Attacks by Militias in Darfur
Topic 71: Military Police Officers' Families Protest Over Unpaid Salaries
Topic 72: Protests Against Bolsonaro and Gender-Based Violence in Brazil
Topic 73: Global Protests for Gender Equality on International Women's Day
Topic 74: Protests Against Dismissal of DEM Party Mayor Mehmet Siddik Akis
Topic 75: Protests Against Dismissal of HDP Mayors
Topic 76: Protests Against Netanyahu's Judicial Overhaul
Topic 77: Unidentified Armed Group Attacks on Women
Topic 78: Protests Against Farm Laws by SKM and Women Farmers
Topic 79: Student Protests Against School Handling of Sexual Misconduct
Topic 80: International Day for the Elimination of Violence against Women Protest Flash Mobs
Topic 81: Female Traders' Protests Against Market Changes
Topic 82: Abortion Rights Protests Addressing Roe v. Wade Overturn
Topic 83: Protests against Death of Kurdish Woman in Iranian Police Custody by Swedish-Iranian Community
Topic 84: Off-Duty Female Police Officer Shootings
Topic 85: Protests Against Rape and Murder of Woman Veterinary Doctor
Topic 86: Caste-Based Sexual Violence in Uttar Pradesh
Topic 87: Women's Protests for Political Prisoners' Immediate Release in May by February 14 Youth Coalition
Topic 88: Women's Protests Against Liquor Shops in Villages
Topic 89: International Women's Day Marches and Protests
Topic 90: Women's Strike Movement Protesting Abortion Law Restrictions in November
Topic 91: Women's Rights Activists Protests for Hostage Release
Topic 92: Saturday Mothers' Weekly Protests for Justice and Missing Relatives
Topic 93: Women's March Protesting Amy Coney Barrett's Supreme Court Nomination
Topic 94: Protests Against Turkish Military Operations in Kurdish Regions
Topic 95: Violent Attacks by Suspected Fulani Pastoralists on Farmers and Women
Topic 96: Women's Rights Demonstrations in Dalarna
Topic 97: Women's Rights and International Women's Day Protests
Topic 98: Women's Protests Against Abortion Restrictions
Topic 99: Demonstrations Opposing the Overturn of Roe v. Wade
Topic 100: Gang-Related Fatalities of Women
Topic 101: Protests Against Macron's Appointment of Barnier as Prime Minister
Topic 102: Women's Rights Rally Against Abortion Restrictions
Topic 103: Supreme Court and Senate Bill 8 Abortion Protests
Topic 104: Women's Rights Protest Against Abortion Restrictions in the UK
Topic 105: Protests Against Rape and Violence Against Women in Bangladesh
Topic 106: Turkish Government's Withdrawal from Women's Rights Convention
Topic 107: Sexual Violence by RSF Against Civilians
Topic 108: Detainees' Mothers Association Protests
Topic 109: University Fraternity Sexual Misconduct Protests
Topic 110: Alleged Poisoning and Mass Sociogenic Illness in Schools
Topic 111: Supreme Court Protests Against Leaked Draft to Overturn Roe v. Wade
Topic 112: Abortion Rights Protests in Response to Leaked Supreme Court Draft on Roe v. Wade
Topic 113: Women's Rights Protests Against Withdrawal from Violence Prevention Convention
Topic 114: Protests Against Smart Power Meters and Electricity Issues
Topic 115: Protests Against Hijab Enforcement in Iran
Topic 116: University Protests Against Police Violence and Detention Practices
Topic 117: Vanessa Guillen Vigil Protests and Justice Movement
Topic 118: Take Back the Night Marches Against Gender-Based Violence
Topic 119: Women's Strike Protests Against Abortion Restrictions
Topic 120: Nationwide Strikes and Demonstrations for Gender Equality on International Women's Day
Topic 121: Armed Attacks on Female Mayoral Candidates
Topic 122: Student Protests Against Leaked Supreme Court Draft on Abortion Rights
Topic 123: Abortion Rights and Anti-Abortion Protests
Topic 124: Protests by Saturday Mothers for Justice for Missing Detainees
Topic 125: Protests for Health Services for Ill Prisoners
Topic 126: Nationwide Flower Demonstrations Against Sexual Violence Acquittals
Topic 127: Violence Against Female Petitioners in China
Topic 128: QSD Detainment of Women in Countryside for Unknown Reasons
Topic 129: PKK-Affiliated Youth Movement Kidnapping Girls for Conscription in Countryside Areas
Topic 130: Murle-Lou Nuer Conflict and Violence
Topic 131: Pro-Choice Demonstrations in Response to Dobbs v. Jackson Draft Leak
Topic 132: International Women's Day Protests Against Gender Violence and Inequality
Topic 133: ADF Attacks on Civilians in Villages
Topic 134: Government Actions Against Ladies in White Protests
Topic 135: Women's Rights Demonstrations in Gothenburg by Kvinnostrejk Movement
Topic 136: Election Protests and Claims of Rigging by PDP Women
Topic 137: Demonstrations Supporting Rule of Law Against Far-Right Political Influence
Topic 138: Unsolved Fatal Shootings of Women by Unidentified Perpetrators
Topic 139: Breast Cancer Awareness and Healthcare Challenges
Topic 140: Women's Protests Against HTS for Detainee Release in Countryside
Topic 141: Nationwide Protests Against Inflation and Corruption on July 27th
Topic 142: Protest Against Gender-Based Violence and Child Disappearance
Topic 143: One Billion Rising Protest Against Gender Violence
Topic 144: Nepali Congress Anti-Government Demonstrations on September 24
Topic 145: Violence Involving FARC Dissidents and ELN in Rural Areas
Topic 146: Protests Against Violence and Femicides on International Day for the Elimination of Violence Against Women
Topic 147: Kurdish Newroz Celebrations and Political Demands
Topic 148: Mahsa Amini Mourning Observance Protests on University Campuses
Topic 149: Protests Over Luis Rubiales' Non-consensual Kiss in Women's World Cup
Topic 150: Korean Farmers' Advocacy and Government Policy Criticism
Topic 151: Sunni Baloch Protests Against Government Violence
Topic 152: International Women's Day Demonstrations for Gender Equality and Women's Rights
Topic 153: Protests Against Gender Violence on International Day for Elimination of Violence against Women
Topic 154: Women's Rights Demonstrations and Pension Reform Protests on International Women's Day
Topic 155: Al Shabaab Attacks on Women Accused of Government Collaboration
Topic 156: Sexual Violence and Murder by Unidentified Armed Groups in Conflict Zones
Topic 157: International Women's Day Protests Against Femicides and for Women's Rights
Topic 158: Women's Rights and Gender Violence Protest on International Women's Day
Topic 159: Feminist Protests Against Gender-Based Violence in November
Topic 160: Opposition to Alcohol Legalisation in Manipur
Topic 161: Houthi-Sponsored Protest in Solidarity with Palestinians Against Zionist Actions
Topic 162: Women's Abortion Rights Protests in October Coordinated by Women's Strike
Topic 163: International Women's Day Demonstrations for Gender Equality and Equal Pay
Topic 164: Advocacy for Amending Prostitution Punishment Laws
Topic 165: Women’s Protests Against Police Misconduct and Recruitment Practices
Topic 166: Protest Against Quran Burning by Far-Right Politician
Topic 167: Dahalo-Related Abductions and Attacks in Madagascar
Topic 168: Alleged School Poisonings and Mass Hysteria in Schools
Topic 169: Voting Rights and Capitol Riot Remembrance Events
Topic 170: Targeted Property Attacks and Vandalism Incidents
Topic 171: Women's Rights and Gender Equality in the Catholic Church
Topic 172: Protests for Justice and Prevention of Child Murders
Topic 173: Women's Strike Protests Against Abortion Restrictions
Topic 174: Gang Violence and Sexual Assault Amid Clashes in Commune Areas
Topic 175: Protests for Justice for Nirmala Pant Rape and Murder
Topic 176: Protest Against Valeri Simeonov by Mothers of Children with Disabilities
Topic 177: International Women's Day Protests Against Femicides
Topic 178: Protests Against Violence in Arab Israeli Community
Topic 179: Protests Supporting Palestinian Prisoners in Israeli Jails
Topic 180: Women's Strike Movement Protest Against Abortion Restrictions
Topic 181: Kidnappings and Ransom Demands in Nigeria
Topic 182: Alleged Poisoning of Female Students and Government Involvement Protest
Topic 183: RLD Protests Against Communal Violence and Demand for President's Rule
Topic 184: Women's March Against Gender-Based Violence
Topic 185: Protest Against Macron's Defense of Gerard Depardieu Amid Rape and Harassment Accusations
Topic 186: Houthi-Sponsored Protests in Solidarity with Palestine
Topic 187: International Women's Day Marches for Women's Rights
Topic 188: Drug Trafficking Tribunals and Victim Abductions
Topic 189: Nationwide Strike by Left Trade Unions in Protest Against Economic Policies
Topic 190: Advocacy for Comprehensive Legal Reform on Sexual Violence in Support of Gisele Pelicot
Topic 191: Protests Against Iranian Hijab Laws and Women's Rights支持
Topic 192: Protests Commemorating EDSA People Power Revolution and Opposing Charter Change in the Philippines
Topic 193: Solidarity Protests Against Indian Government's Actions in Kashmir
Topic 194: Taliban Restrictions on Women's Media and Public Presence
Topic 195: Student Protests Against Government Following Death in Police Custody
Topic 196: Meitei Community Protests and Government Response in Tribal Conflict
Topic 197: Houthi-Sponsored Demonstrations in Support of Palestinians and Against US-Zionist Actions
Topic 198: Women's Protest Against Abortion Restrictions
Topic 199: CODECO-URDPC Attacks on Civilians and Looting
Topic 200: Protest Against Attack on Jatiya Parishad's State President
Topic 201: Mob Justice and Police Intervention in Lynching Incidents
Topic 202: Detention and Re-education of Uyghur Women in China
Topic 203: Women's Equality Protest Rally
Topic 204: Protests Over Mahsa Amini's Death in Police Custody
Topic 205: Houthi-Sponsored Protest in Solidarity with Palestinians and Commemoration of Late President Saleh Ali Al Samad
Topic 206: Protests Against COVID-19 Maternity Regulations
Topic 207: Women's Strike Movement Protesting Abortion Restrictions
Topic 208: Protests and Demonstrations in Prisons Regarding Conditions and Visitation Rights
Topic 209: Women's Rights Protests for Gender Equality and Abortion Legalization
Topic 210: Abuse of Female Political Prisoners by Guards
Topic 211: Women's Rights and Abortion Restrictions Protest
Topic 212: Protests Against Violence Targeting Women by HDP and TJA Members
Topic 213: Vigilante-Style Attacks on Female Drug Suspects in the Philippines
Topic 214: Student Protests Against Rape and Violence in Universities
Topic 215: Detention of Women by Turkish and Syrian Intelligence
Topic 216: Political Protests Against Arrests in Pakistan
Topic 217: Female Teachers' Protests for Salary and Rights in Education Sector
Topic 218: Protests Against Violence Towards Women in Response to Pelicot Trial
Topic 219: Women's Rights and Anti-Patriarchy Protest Movement
Topic 220: Anti-Nuclear Protests and Commemoration Movement
Topic 221: Nationwide Protests Over Doctor's Rape-Murder Case
Topic 222: Protests Against NIA Case on Arambai Tenggol's Chief Amidst Meitei-Tribal Violence
Topic 223: Protests Against Femicide Commemorating Giulia Cecchettin
Topic 224: Abortion Rights Protests and Counter-Protests Involving Catholic Groups
Topic 225: Protests in Response to Death of Woman in Modesty Police Custody
Topic 226: Protest Against Government Apathy on Migrant Laborer Deaths During Pandemic
Topic 227: Protests Against Government Pressure on Independent Media and Foreign Ownership Restrictions
Topic 228: Militia Attacks and Abductions in Villages
Topic 229: Houthi-Sponsored Protest on Zayd ibn Ali's Death Anniversary
Topic 230: Polish Abortion Law Protests and Solidarity Movements
Topic 231: Violence Against Indigenous Women and Leaders
Topic 232: Demonstrations for Missing Persons Under State Detention
Topic 233: Nationwide Farmer Protest Against Farm Laws
Topic 234: Women's Protests Against Judicial Reforms Inspired by "The Handmaid's Tale"
Topic 235: Protests for Search of Remains of Missing Indigenous Women at Landfills
Topic 236: Protest Against Environmental Impact of Iron Sand Mining
Topic 237: Protests for Reinvestigation of Drug Haul Cases
Topic 238: Protests Against Sexual Violence in District (Country)
Topic 239: Anti-War Protests and Peace Advocacy by Codepink
Topic 240: Protest Against Agriculture Minister Pradeep Maharathy for Derogatory Remarks by BJP Mahila Morcha
Topic 241: International Women's Day Protests for Gender Equality and Against Gender-Based Violence
Topic 242: Houthi Protest Commemorating Tanumah Massacre and Supporting Palestinian Solidarity
Topic 243: Anti-Harassment Protests and Accountability in Media & Local Government
Topic 244: Barangay Leadership Attacks by Unidentified Assailants
Topic 245: Women's Rights Protests and Anti-Gender Violence Performances
Topic 246: Demonstrations on Bioethics Law and Assisted Procreation Rights
Topic 247: Protests Over Coronavirus Vaccine Availability and Distribution
Topic 248: Moroccan Protest Supporting Palestinian Women and Opposing Israeli Actions
Topic 249: Enforcement of Hijab by Iranian Morality Police and Sociopolitical Impact
Topic 250: Women's Protest Against Abortion Restrictions
Topic 251: Protests Against Hijab Rules and Mahsa Amini's Death
Topic 252: Protests Against Insecurity and Fulani Pastoralists in Akoko Region
Topic 253: Protests Against Femicide by Women's Groups
In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Convert only necessary columns to pandas, preserving order
df_plot = pl_df_clean.select(["country", "notes_clean"]).drop_nulls().to_pandas()

# Ensure the length matches the number of topics
assert len(df_plot) == len(topics), "❌ 'topics' length does not match the cleaned documents."

# Assign topics to cleaned documents
df_doc_topics = pd.DataFrame({
    "country": df_plot["country"].values,
    "topic": topics
})

# Count topics per country
topic_counts = df_doc_topics.groupby(["country", "topic"]).size().reset_index(name="count")

# Relative percentage per country
topic_counts["total_country"] = topic_counts.groupby("country")["count"].transform("sum")
topic_counts["percentage"] = 100 * topic_counts["count"] / topic_counts["total_country"]

# Get topic labels
openai_topics = topic_model.get_topics(full=True)["OpenAI"]
labels_df = pd.DataFrame([
    {"topic": topic_id, "label": label_info[0][0].split("\n")[0]}
    for topic_id, label_info in openai_topics.items()
])

# Merge with labels
topic_counts["topic"] = topic_counts["topic"].astype(int)
labels_df["topic"] = labels_df["topic"].astype(int)
topic_counts = topic_counts.merge(labels_df, on="topic", how="left")

# Top-N topics per country
top_n = 5
top_topics_per_country = topic_counts.sort_values(["country", "percentage"], ascending=[True, False])\
                                     .groupby("country").head(top_n)

# Limit to only 3 countries
countries_to_plot = top_topics_per_country["country"].drop_duplicates().sort_values().head(3)
top_topics_per_country = top_topics_per_country[top_topics_per_country["country"].isin(countries_to_plot)]

# Plot with Seaborn
g = sns.FacetGrid(
    top_topics_per_country,
    col="country",
    col_wrap=3,
    sharey=False,
    height=4,
    aspect=1.5
)

g.map_dataframe(
    sns.barplot,
    x="percentage",
    y="label",
    palette="tab10"
)

g.set_titles(col_template="{col_name}")
g.set_axis_labels("Percentage (%)", "Topic")
for ax in g.axes.flatten():
    for label in ax.get_yticklabels():
        label.set_rotation(0)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [48]:
openai_labels = [
    label[0][0].split("\n")[0]
    for label in topic_model.get_topics(full=True)["OpenAI"].values()
]

topic_model.set_topic_labels(openai_labels)

topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
In [49]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Convert required columns from Polars
df_plot = pl_df_clean.select(["event_date", "notes_clean"]).drop_nulls().to_pandas()
df_plot["event_date"] = pd.to_datetime(df_plot["event_date"])

# Validate topic length
assert len(df_plot) == len(topics), "❌ Length of 'topics' does not match cleaned documents."

# Create DataFrame of topics per document
df_doc_topics = pd.DataFrame({
    "event_date": df_plot["event_date"].values,
    "topic": topics
})

# Add month column
df_doc_topics["month"] = df_doc_topics["event_date"].dt.to_period("M").dt.to_timestamp()

# Group by month and topic
monthly_topic_counts = df_doc_topics.groupby(["month", "topic"]).size().reset_index(name="count")

# Calculate percentage within each month
monthly_topic_counts["monthly_total"] = monthly_topic_counts.groupby("month")["count"].transform("sum")
monthly_topic_counts["percentage"] = 100 * monthly_topic_counts["count"] / monthly_topic_counts["monthly_total"]

# Get topic labels from the model
openai_topics = topic_model.get_topics(full=True)["OpenAI"]
labels_df = pd.DataFrame([
    {"topic": topic_id, "label": label_info[0][0].split("\n")[0]}
    for topic_id, label_info in openai_topics.items()
])
labels_df["topic"] = labels_df["topic"].astype(int)
monthly_topic_counts["topic"] = monthly_topic_counts["topic"].astype(int)
monthly_topic_counts = monthly_topic_counts.merge(labels_df, on="topic", how="left")

# Select top-N global topics by volume
top_n = 5
top_topics = (
    monthly_topic_counts.groupby("topic")["count"]
    .sum()
    .nlargest(top_n)
    .index.tolist()
)
df_top = monthly_topic_counts[monthly_topic_counts["topic"].isin(top_topics)]

# Monthly percentage line plot
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=df_top, x="month", y="percentage", hue="label", marker="o")

plt.legend(title="Topic (OpenAI)", bbox_to_anchor=(1.02, 1), loc="upper left", borderaxespad=0)
plt.title(f"Monthly trend (% of total) of the top {top_n} most frequent topics")
plt.xlabel("Month")
plt.ylabel("Percentage of documents (%)")
plt.xticks(rotation=45)
plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.show()
No description has been provided for this image
In [54]:
import itertools
import pandas as pd

# Define colors for the visualization to iterate over
colors = itertools.cycle(['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff', '#000000'])
color_key = {str(topic): next(colors) for topic in set(topic_model.topics_) if topic != -1}

# Prepare dataframe and ignore outliers
dfo = pd.DataFrame({"x": reduced_embeddings[:, 0], "y": reduced_embeddings[:, 1], "Topic": [str(t) for t in topic_model.topics_]})
dfo["Length"] = [len(doc) for doc in documents]
dfo = dfo.loc[dfo.Topic != "-1"]
dfo = dfo.loc[(dfo.y > -10) & (dfo.y < 10) & (dfo.x < 10) & (dfo.x > -10), :]
dfo["Topic"] = dfo["Topic"].astype("category")

# Get centroids of clusters
mean_df = dfo.groupby("Topic").mean().reset_index()
mean_df.Topic = mean_df.Topic.astype(int)
mean_df = mean_df.sort_values("Topic")
In [56]:
import seaborn as sns
from matplotlib import pyplot as plt
from adjustText import adjust_text
import matplotlib.patheffects as pe
import textwrap

fig = plt.figure(figsize=(20, 20))
sns.scatterplot(
    data=dfo,
    x='x',
    y='y',
    hue='Topic',
    palette=color_key,
    alpha=0.4,
    size='Length',
    sizes=(10, 200),
    legend=False
)

# Annotate top 50 topics
texts, xs, ys = [], [], []
for row in mean_df.iterrows():
  topic = row[1]["Topic"]
  name = textwrap.fill(topic_model.custom_labels_[int(topic)], 20)
  if int(topic) <= 50:
    xs.append(row[1]["x"])
    ys.append(row[1]["y"])
    texts.append(plt.text(row[1]["x"], row[1]["y"], name, size=10, ha="center", color=color_key[str(int(topic))],
                          path_effects=[pe.withStroke(linewidth=0.5, foreground="black")]
                          ))

# Adjust annotations such that they do not overlap
adjust_text(texts, x=xs, y=ys, time_lim=1, force_text=(0.01, 0.02), force_static=(0.01, 0.02), force_pull=(0.5, 0.5))
plt.axis('off')
plt.legend('', frameon=False)
plt.show()
No description has been provided for this image
In [58]:
# Extract required data
df_plot = pl_df_clean.select(["event_date", "country", "notes_clean"]).drop_nulls().to_pandas()
df_plot["event_date"] = pd.to_datetime(df_plot["event_date"])

# Validate length
assert len(df_plot) == len(topics), "❌ Length of 'topics' does not match the documents."

# Create base DataFrame
df_doc_topics = pd.DataFrame({
    "event_date": df_plot["event_date"].values,
    "country": df_plot["country"].values,
    "topic": topics
})

# Add month column
df_doc_topics["month"] = df_doc_topics["event_date"].dt.to_period("M").dt.to_timestamp()

# Group by month, country, and topic
grouped = df_doc_topics.groupby(["month", "country", "topic"]).size().reset_index(name="count")

# Calculate % per month and country
grouped["total"] = grouped.groupby(["month", "country"])["count"].transform("sum")
grouped["percentage"] = 100 * grouped["count"] / grouped["total"]

# Add OpenAI topic labels
openai_topics = topic_model.get_topics(full=True)["OpenAI"]
labels_df = pd.DataFrame([
    {"topic": topic_id, "label": label_info[0][0].split("\n")[0]}
    for topic_id, label_info in openai_topics.items()
])
labels_df["topic"] = labels_df["topic"].astype(int)
grouped["topic"] = grouped["topic"].astype(int)
grouped = grouped.merge(labels_df, on="topic", how="left")

# Filter top-N agendas per country and month
top_n = 3
top_agendas = grouped.sort_values(["month", "country", "percentage"], ascending=[True, True, False])\
                     .groupby(["month", "country"]).head(top_n)

top_agendas.head()
Out[58]:
month country topic count total percentage label
0 2018-01-01 Afghanistan -1 1 3 33.333333 Protests and Violence Against Women
1 2018-01-01 Afghanistan 32 1 3 33.333333 Taliban's Violations and Abuse Against Women
2 2018-01-01 Afghanistan 77 1 3 33.333333 Unidentified Armed Group Attacks on Women
3 2018-01-01 Albania -1 1 1 100.000000 Protests and Violence Against Women
4 2018-01-01 Argentina -1 4 5 80.000000 Protests and Violence Against Women
In [60]:
# Ensure groupe' is created
april_2025 = grouped[grouped["month"] == "2025-04"]

# Sort by country and descending percentage
april_2025 = april_2025.sort_values(["country", "percentage"], ascending=[True, False])

# View top-N agendas per country (e.g., 3 per country)
top_n = 3
april_top = april_2025.groupby("country").head(top_n)

april_top[["country", "topic", "label", "percentage"]]
Out[60]:
country topic label percentage
18762 Algeria -1 Protests and Violence Against Women 50.000000
18763 Algeria 87 Women's Protests for Political Prisoners' Imme... 50.000000
18764 Angola 78 Protests Against Farm Laws by SKM and Women Fa... 100.000000
18765 Argentina 3 Protests for Justice in Femicide Cases 100.000000
18766 Australia 41 Extinction Rebellion and Climate Action Protests 100.000000
... ... ... ... ...
18938 Uzbekistan 78 Protests Against Farm Laws by SKM and Women Fa... 100.000000
18939 Venezuela -1 Protests and Violence Against Women 100.000000
18945 Yemen 186 Houthi-Sponsored Protests in Solidarity with P... 38.095238
18940 Yemen -1 Protests and Violence Against Women 28.571429
18944 Yemen 161 Houthi-Sponsored Protest in Solidarity with Pa... 14.285714

145 rows × 4 columns

In [62]:
april_top.to_csv("Output/Agendas_April25.csv", index=False)
In [64]:
# Monthly Classification 
In [66]:
!pip install sentence-transformers
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Requirement already satisfied: sentence-transformers in /opt/anaconda3/lib/python3.11/site-packages (5.0.0)
Requirement already satisfied: transformers<5.0.0,>=4.41.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.53.2)
Requirement already satisfied: tqdm in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.65.0)
Requirement already satisfied: torch>=1.11.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (2.2.2)
Requirement already satisfied: scikit-learn in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (1.6.1)
Requirement already satisfied: scipy in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (1.11.4)
Requirement already satisfied: huggingface-hub>=0.20.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (0.33.4)
Requirement already satisfied: Pillow in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (10.2.0)
Requirement already satisfied: typing_extensions>=4.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.14.0)
Requirement already satisfied: filelock in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (3.13.1)
Requirement already satisfied: fsspec>=2023.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2023.6.0)
Requirement already satisfied: packaging>=20.9 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (23.1)
Requirement already satisfied: pyyaml>=5.1 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (6.0.1)
Requirement already satisfied: requests in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2.32.3)
Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (1.1.5)
Requirement already satisfied: sympy in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (1.12)
Requirement already satisfied: networkx in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (3.1)
Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (3.1.3)
Requirement already satisfied: numpy>=1.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (1.26.4)
Requirement already satisfied: regex!=2019.12.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (2023.10.3)
Requirement already satisfied: tokenizers<0.22,>=0.21 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.21.2)
Requirement already satisfied: safetensors>=0.4.3 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.5.2)
Requirement already satisfied: joblib>=1.2.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn->sentence-transformers) (1.2.0)
Requirement already satisfied: threadpoolctl>=3.1.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn->sentence-transformers) (3.5.0)
Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.11/site-packages (from jinja2->torch>=1.11.0->sentence-transformers) (2.1.3)
Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.0.7)
Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2025.4.26)
Requirement already satisfied: mpmath>=0.19 in /opt/anaconda3/lib/python3.11/site-packages (from sympy->torch>=1.11.0->sentence-transformers) (1.3.0)
In [89]:
import pandas as pd

# Load the original file
df_women = pd.read_csv("Input/Jun25.csv")

# Apply the topic model to the 'notes' column
documents = df_women["notes"].astype(str).tolist()
topics, probs = topic_model.transform(documents)

# Add topic results to the original DataFrame
df_women["topic"] = topics
df_women["probability"] = probs

# Get and clean topic labels
raw_labels = topic_model.get_topic_info()[["Topic", "Name"]].set_index("Topic")["Name"].to_dict()
clean_labels = {
    topic_id: (
        label.split("_", 1)[1] if topic_id != -1 and "_" in label else "Unassigned"
    )
    for topic_id, label in raw_labels.items()
}
df_women["topic_label"] = df_women["topic"].map(clean_labels)
Batches:   0%|          | 0/403 [00:00<?, ?it/s]
2025-07-14 18:32:25,256 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-07-14 18:32:35,652 - BERTopic - Dimensionality - Completed ✓
2025-07-14 18:32:35,653 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-07-14 18:32:36,337 - BERTopic - Cluster - Completed ✓
In [91]:
# Filter out rows with topic = -1 (unassigned)
df_women_filtered = df_women[df_women["topic"] != -1].copy()

# Check a sample
print(df_women_filtered[["country", "topic", "topic_label", "probability"]].sample(5))
             country  topic                           topic_label  probability
7323   United States     20                trump_musk_elon_donald     0.802337
9384           Sudan      4             amini_mahsa_rioters_death     0.645559
10256           Iran     85      veterinary_doctor_murder_awarded     0.347326
292            Nepal      8  anganwadi_workers_helpers_assistants     0.452108
4033   United States     21         transgender_lgbtq_pride_trans     1.000000
In [93]:
# Save cleaned file
df_women_filtered[["country", "topic", "topic_label", "probability"]].to_csv("Output/Jun25.csv", index=False)